import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
#import matplotlib as mpl
from matplotlib.ticker import FuncFormatter
import numpy as np
import pandas as pd
import pandas_profiling
import math
import seaborn as sns
import gc
from datetime import datetime
from pygeocoder import Geocoder
pd.set_option('display.max_columns', None)
def currency(x, pos):
'The two args are the value and tick position'
if x >= 1000000:
return '${:1.1f}M'.format(x*1e-6)
elif x >= 1000000:
return '${:1.1f}K'.format(x*1e-3)
return '${:1.0f}'.format(x*1e-0)
def number(x, pos):
'The two args are the value and tick position'
if x >= 1000000:
return '{:1.1f}M'.format(x*1e-6)
elif x >= 1000000:
return '{:1.1f}K'.format(x*1e-3)
return '{:1.0f}'.format(x*1e-0)
#numRows = 50000 # 50 thousand
#numRows = 500000 # 500 thousand
#numRows = 1000000 # 1 million
numRows = 1000000000 # 1 billion
dt = "Jan 2013"
fileMonth = "1"
my_colors =[
'#1F77B4', '#AEC7E8', '#FF7F0E', '#FFBB78', '#2CA02C', '#98DF8A',
'#D62728', '#FF9896', '#9467BD', '#C5B0D5', '#8C564B', '#C49C94',
'#E377C2', '#F7B6D2', '#7F7F7F', '#C7C7C7', '#BCBD22', '#DBDB8D',
'#17BECF', '#9EDAE5'
]
#set the fonts that are on the box so as to stop the warning messages
# specify the custom font to use
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'DejaVu Sans'
tripData1 = pd.read_csv('/dltraining/datasets/Trip Data/trip_data_' + fileMonth + '.csv', nrows = numRows)
tripData1.head()
# work out the min and max dates of the pickup time.
minPickup=min(tripData1['pickup_datetime'])
maxPickup=max(tripData1['pickup_datetime'])
print('MinPickup: %s, Max Pickup: %s' % (minPickup, maxPickup))
#convert to datetime
tripData1.loc[:, 'pickup_datetime_dt'] = pd.to_datetime(tripData1.loc[:, 'pickup_datetime'])
tripData1.loc[:, 'dropoff_datetime_dt'] = pd.to_datetime(tripData1.loc[:, 'dropoff_datetime'])
tripFare1 = pd.read_csv('/dltraining/datasets/Trip Fare/trip_fare_' + fileMonth + '.csv', nrows = numRows)
#remove the leading spaces
tripFare1.columns = ['medallion', 'hack_license', 'vendor_id', 'pickup_datetime',
'payment_type', 'fare_amount', 'surcharge', 'mta_tax',
'tip_amount', 'tolls_amount', 'total_amount']
tripFare1.head()
#convert to datetime
tripFare1.loc[:, 'pickup_datetime_dt'] = pd.to_datetime(tripFare1.loc[:, 'pickup_datetime'])
if numRows < 100000: pandas_profiling.ProfileReport(tripFare1)
if numRows < 100000: pandas_profiling.ProfileReport(tripData1)
Wanting to only include valid trips within the New York Region
#Drop rows with long / lat data that is either in correct or out of range that we are interested in
print('Old size: %d' % len(tripData1))
tripData1 = tripData1[((tripData1.pickup_longitude<=-71.000000) & (tripData1.pickup_longitude>=-75.000000)) &\
((tripData1.pickup_latitude>=39.900000) & (tripData1.pickup_latitude<=42.000000)) &\
((tripData1.dropoff_longitude<=-71.000000) & (tripData1.dropoff_longitude>=-75.000000)) &\
((tripData1.dropoff_latitude>=39.900000) & (tripData1.dropoff_latitude<=42.000000))]
print('New size: %d' % len(tripData1))
tripData1.dtypes
# minimum and maximum longitude
min(tripData1.pickup_longitude.min(), tripData1.dropoff_longitude.min()), \
max(tripData1.pickup_longitude.max(), tripData1.dropoff_longitude.max())
# minimum and maximum latitude
min(tripData1.pickup_latitude.min(), tripData1.dropoff_latitude.min()), \
max(tripData1.pickup_latitude.max(), tripData1.dropoff_latitude.max())
tripDataFare = pd.merge(tripData1, tripFare1, how='left', left_on = ['medallion', 'hack_license', 'vendor_id', 'pickup_datetime', 'pickup_datetime_dt'], right_on = ['medallion', 'hack_license', 'vendor_id', 'pickup_datetime', 'pickup_datetime_dt'] )
# Delete the objects used to load the data as we now have a single file so don't need the old ones.
del tripData1, tripFare1
tripDataFare['trip_time_in_min'] = (tripDataFare['trip_time_in_secs']/60)
tripDataFare.columns
# Convert longitude and latitude to a location
#results = Geocoder.reverse_geocode(tripDataFare['pickup_latitude'][0], tripDataFare['pickup_longitude'][0])
def weekDayName(i):
switcher={
0:'Monday',
1:'Tuesday',
2:'Wednesday',
3:'Thursday',
4:'Friday',
5:'Saturday',
6:'Sunday'
}
return str(switcher.get(i,"Invalid day of week"))
def weekdayWeekend(i):
switcher={
0:'Weekday',
1:'Weekday',
2:'Weekday',
3:'Weekday',
4:'Weekday',
5:'Weekend',
6:'Weekend'
}
return str(switcher.get(i,"Invalid day of week"))
# create a function to return a value for the time of day it is
# these values are arbitary and need reviewing
def timeOfDay(i):
switcher={
0:'Night',
1:'Night',
2:'Night',
3:'Night',
4:'Night',
5:'Night',
6:'Morning Rush Hour',
7:'Morning Rush Hour',
8:'Morning Rush Hour',
9:'Morning Rush Hour',
10:'Late morning',
11:'Late morning',
12:'Late morning',
13:'Early afternoon',
14:'Early afternoon',
15:'Early afternoon',
16:'Evening Rush Hour',
17:'Evening Rush Hour',
18:'Evening Rush Hour',
19:'Evening Rush Hour',
20:'Evening',
21:'Evening',
22:'Evening',
23:'Evening'
}
return switcher.get(i,"Invalid Time")
# add date dimensional data for reporting reasons
tripDataFare["pickup_day"] = tripDataFare['pickup_datetime_dt'].map(lambda x: x.day)
tripDataFare["pickup_month"] = tripDataFare['pickup_datetime_dt'].map(lambda x: x.month)
tripDataFare["pickup_year"] = tripDataFare['pickup_datetime_dt'].map(lambda x: x.year)
tripDataFare["pickup_date"] = tripDataFare['pickup_datetime_dt'].map(lambda x: x.date())
tripDataFare["pickup_weekday"] = tripDataFare['pickup_datetime_dt'].map(lambda x: x.weekday()) # Monday = 0, Sunday = 6
tripDataFare["pickup_isoweekday"] = tripDataFare['pickup_datetime_dt'].map(lambda x: x.isoweekday()) # Monday = 1, Sunday = 7
# add time dimensional data for reporting reasons
tripDataFare["pickup_hour"] = tripDataFare['pickup_datetime_dt'].map(lambda x: x.hour)
tripDataFare["pickup_minute"] = tripDataFare['pickup_datetime_dt'].map(lambda x: x.minute)
tripDataFare["pickup_time"] = tripDataFare['pickup_datetime_dt'].map(lambda x: x.time())
# Convert the day number into a day of the week
tripDataFare['pickup_dayName'] = tripDataFare["pickup_weekday"].map(lambda x: weekDayName(x))
# Convert the day number into a day of the week
tripDataFare['pickup_weekdayWeekend'] = tripDataFare["pickup_weekday"].map(lambda x: weekdayWeekend(x))
tripDataFare['pickup_timeOfDay'] = tripDataFare['pickup_hour'].map(lambda x: timeOfDay(x))
# add date dimensional data for reporting reasons
tripDataFare["dropoff_day"] = tripDataFare['dropoff_datetime_dt'].map(lambda x: x.day)
tripDataFare["dropoff_month"] = tripDataFare['dropoff_datetime_dt'].map(lambda x: x.month)
tripDataFare["dropoff_year"] = tripDataFare['dropoff_datetime_dt'].map(lambda x: x.year)
tripDataFare["dropoff_date"] = tripDataFare['dropoff_datetime_dt'].map(lambda x: x.date())
tripDataFare["dropoff_weekday"] = tripDataFare['dropoff_datetime_dt'].map(lambda x: x.weekday()) # Monday = 0, Sunday = 6
tripDataFare["dropoff_isoweekday"] = tripDataFare['dropoff_datetime_dt'].map(lambda x: x.isoweekday()) # Monday = 1, Sunday = 7
# add time dimensional data for reporting reasons
tripDataFare["dropoff_hour"] = tripDataFare['dropoff_datetime_dt'].map(lambda x: x.hour)
tripDataFare["dropoff_minute"] = tripDataFare['dropoff_datetime_dt'].map(lambda x: x.minute)
tripDataFare["dropoff_time"] = tripDataFare['dropoff_datetime_dt'].map(lambda x: x.time())
# Convert the day number into a day of the week
tripDataFare['dropoff_dayName'] = tripDataFare["dropoff_weekday"].map(lambda x: weekDayName(x))
# Convert the day number into a day of the week
tripDataFare['dropoff_weekdayWeekend'] = tripDataFare["dropoff_weekday"].map(lambda x: weekdayWeekend(x))
tripDataFare['dropoff_timeOfDay'] = tripDataFare['dropoff_hour'].map(lambda x: timeOfDay(x))
def tripLength(i):
if i <= 0: # catches 0 and neg numbers as errors
result = "Error"
elif i > 0 and i <= 1:
result = "up to 1 mile"
elif i > 1 and i <= 5:
result = "1 to 5 miles"
elif i > 6 and i <= 10:
result = "6 to 10 miles"
elif i > 10 and i <= 20:
result = "10 to 20 miles"
else:
result = "20 + miles"
return result
def tripDuration(i):
if i <= 0: # catches 0 and neg numbers as errors
result = "Error"
elif i > 0 and i <= 300:
result = "up to 5 minutes"
elif i > 300 and i <= 600:
result = "5 to 10 minutes"
elif i > 600 and i <= 900:
result = "10 to 15 minutes"
elif i > 900 and i <= 1200:
result = "15 to 20 minutes"
else:
result = "20 + minutes"
return result
# Convert the trip distance in a categorical variable
tripDataFare['tripLength'] = tripDataFare["trip_distance"].map(lambda x: tripLength(x))
tripDataFare[['medallion','tripLength']].groupby('tripLength').count()
# Convert the trip duration in a categorical variable
tripDataFare['tripDuration'] = tripDataFare["trip_time_in_secs"].map(lambda x: tripDuration(x))
tripDataFare[['medallion','tripDuration']].groupby('tripDuration').count()
How are Revenue, Trips, Passengers tracking over time
title_font = { 'size':'18', 'color':'black', 'weight':'normal',
'verticalalignment':'bottom'} # Bottom vertical alignment for more space
print(tripDataFare[['pickup_date', 'passenger_count']].shape)
print(tripDataFare[['pickup_date', 'passenger_count']].head())
print(tripDataFare[['pickup_date', 'passenger_count']].tail())
# Calc the number of journeys per day
visDataTrips = tripDataFare[['pickup_date','passenger_count']].groupby('pickup_date', as_index=False).count()
# sort the dates
visDataTrips = visDataTrips.sort_values(by=['pickup_date'])
visDataTrips.columns = ['Pickup Date', '# of Trips']
visDataTrips.head()
#plot
fig, ax = plt.subplots(figsize=(20,5))
ax.set_title('Number of Taxi Trips Made per Day for ' + dt, title_font)
visDataTrips.plot(kind='line', x="Pickup Date", y="# of Trips", ax=ax)
ax.set( xlabel='Pickup Date', ylabel='Total Revenue')
formatter = FuncFormatter(number)
ax.yaxis.set_major_formatter(formatter)
if dt == "Jan 2013":
ax.text('2013-01-22', ax.get_ylim()[0] + ax.get_ylim()[1] /10, "Martin Luther King, Jr.'s Birthday")
ax.text('2013-01-01', ax.get_ylim()[0] + ax.get_ylim()[1] /10, "New Years Day")
plt.show()
ax.get_ylim()[1] /10
# Calc the number of passengers moved per day
visDataPassenger = tripDataFare.groupby(['pickup_date'], as_index=False)['passenger_count'].agg('sum')
# sort the dates
visDataPassenger = visDataPassenger.sort_values(by=['pickup_date'])
visDataPassenger.columns = ['Pickup Date', '# of Passengers']
visDataPassenger.head()
#plot
fig, ax = plt.subplots(figsize=(20,5))
ax.set_title('Number of People Moved per Day for ' + dt, title_font)
visDataPassenger.plot(kind='line', x="Pickup Date", y="# of Passengers", ax=ax)
ax.set( xlabel='Pickup Date', ylabel='Total Revenue')
formatter = FuncFormatter(number)
ax.yaxis.set_major_formatter(formatter)
if dt == "Jan 2013":
ax.text('2013-01-22', ax.get_ylim()[0] + ax.get_ylim()[1] /10, "Martin Luther King, Jr.'s Birthday")
ax.text('2013-01-01', ax.get_ylim()[0] + ax.get_ylim()[1] /10, "New Years Day")
plt.show()
# Calc the number of passengers moved per day
visDataFareRevenue = tripDataFare.groupby(['pickup_date'], as_index=False)['fare_amount'].agg('sum')
# sort the dates
visDataFareRevenue = visDataFareRevenue.sort_values(by=['pickup_date'])
visDataFareRevenue.columns = ['Pickup Date', 'Fare Revenue']
visDataFareRevenue.head()
fig, ax = plt.subplots(figsize=(20,5))
ax.set_title('Total Fare Revenue per Day for ' + dt, title_font)
visDataFareRevenue.plot(kind='line', x="Pickup Date", y="Fare Revenue", ax=ax)
ax.set( xlabel='Pickup Date', ylabel='Fare Revenue')
formatter = FuncFormatter(currency)
ax.yaxis.set_major_formatter(formatter)
if dt == "Jan 2013":
ax.text('2013-01-22', ax.get_ylim()[0] + ax.get_ylim()[1] /10, "Martin Luther King, Jr.'s Birthday")
ax.text('2013-01-01', ax.get_ylim()[0] + ax.get_ylim()[1] /10, "New Years Day")
plt.show()
# Calc the number of passengers moved per day
visDataRevenue = tripDataFare.groupby(['pickup_date'], as_index=False)['total_amount'].agg('sum')
# sort the dates
visDataRevenue = visDataRevenue.sort_values(by=['pickup_date'])
visDataRevenue.columns = ['Pickup Date', 'Total Revenue']
visDataRevenue.head()
fig, ax = plt.subplots(figsize=(20,5))
ax.set_title('Total Revenue per Day for ' + dt, title_font)
visDataRevenue.plot(kind='line', x="Pickup Date", y="Total Revenue", ax=ax)
ax.set( xlabel='Pickup Date', ylabel='Total Revenue')
formatter = FuncFormatter(currency)
ax.yaxis.set_major_formatter(formatter)
if dt == "Jan 2013":
ax.text('2013-01-22', ax.get_ylim()[0] + ax.get_ylim()[1] /10, "Martin Luther King, Jr.'s Birthday")
ax.text('2013-01-01', ax.get_ylim()[0] + ax.get_ylim()[1] /10, "New Years Day")
plt.show()
fig, ax = plt.subplots(figsize=(20,5))
ax.set_title('Total Fare & Fare Revenue per Day for ' + dt, title_font)
visDataRevenue.plot(kind='line', x="Pickup Date", y="Total Revenue", ax=ax)
visDataFareRevenue.plot(kind='line', x="Pickup Date", y="Fare Revenue", ax=ax)
ax.set( xlabel='Pickup Date', ylabel='Revenue')
formatter = FuncFormatter(currency)
ax.yaxis.set_major_formatter(formatter)
ax.text('2013-01-22', ax.get_ylim()[0] + ax.get_ylim()[1] /10 , "Martin Luther King, Jr.'s Birthday")
ax.text('2013-01-01', ax.get_ylim()[0] + ax.get_ylim()[1] /10 , "New Years Day")
plt.show()
#plot
fig, ax = plt.subplots(figsize=(20,5))
ax.set_title('Number of Taxi Trips Made per Day for ' + dt, title_font)
visDataTrips.plot(kind='line', x="Pickup Date", y="# of Trips", ax=ax)
visDataPassenger.plot(kind='line', x="Pickup Date", y="# of Passengers", ax=ax)
ax.set( xlabel='Pickup Date', ylabel='Total Revenue')
formatter = FuncFormatter(number)
ax.yaxis.set_major_formatter(formatter)
if dt == "Jan 2013":
ax.text('2013-01-22', ax.get_ylim()[0] + ax.get_ylim()[1] /10, "Martin Luther King, Jr.'s Birthday")
ax.text('2013-01-01', ax.get_ylim()[0] + ax.get_ylim()[1] /10, "New Years Day")
plt.show()
visDataTripsPassengers = pd.merge(visDataTrips, visDataPassenger, how='left', left_on = ['Pickup Date'], right_on = ['Pickup Date'])
visDataTripsPassengers.head()
#plot
fig, ax = plt.subplots(figsize=(10,10))
ax.set_title('Scatter of Taxi Trips Made per Day for ' + dt, title_font)
visDataTripsPassengers.plot(kind='scatter', x='# of Passengers', y='# of Trips', s=20, ax=ax)
for line in range(0,visDataTripsPassengers.shape[0]):
ax.text(visDataTripsPassengers['# of Passengers'][line]+900, visDataTripsPassengers['# of Trips'][line]-4000, \
visDataTripsPassengers['Pickup Date'][line], horizontalalignment='left', size='small', color='black')
formatter = FuncFormatter(number)
ax.yaxis.set_major_formatter(formatter)
ax.xaxis.set_major_formatter(formatter)
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
plt.show()
visDataTripsPassengersRevenue = pd.merge(visDataTripsPassengers, visDataRevenue, how='left', left_on = ['Pickup Date'], right_on = ['Pickup Date'])
visDataTripsPassengersRevenue.head()
#plot
fig, ax = plt.subplots(figsize=(10,10))
ax.set_title('Scatter Plot of Taxi Trips Made per Day for ' + dt, title_font)
visDataTripsPassengersRevenue.plot(kind='scatter', x='# of Trips', y='Total Revenue', s=20 , ax=ax)
for line in range(0,visDataTripsPassengers.shape[0]):
ax.text(visDataTripsPassengersRevenue['# of Trips'][line]+900, visDataTripsPassengersRevenue['Total Revenue'][line]-4000, \
visDataTripsPassengersRevenue['Pickup Date'][line], horizontalalignment='left', size='small', color='black')
formatter = FuncFormatter(currency)
ax.yaxis.set_major_formatter(formatter)
formatter = FuncFormatter(number)
ax.xaxis.set_major_formatter(formatter)
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
plt.show()
visDataTripsPassengersRevenueFareRevenue = pd.merge(visDataTripsPassengersRevenue, visDataFareRevenue, how='left', left_on = ['Pickup Date'], right_on = ['Pickup Date'])
visDataTripsPassengersRevenueFareRevenue.head()
#plot
fig, ax = plt.subplots(figsize=(10,10))
ax.set_title('Scatter Plot of Fare Revenue and Total Fare Revenue per Day for ' + dt, title_font)
visDataTripsPassengersRevenueFareRevenue.plot(kind='scatter', x='Fare Revenue', y='Total Revenue', s=20 , ax=ax)
for line in range(0,visDataTripsPassengers.shape[0]):
ax.text(visDataTripsPassengersRevenueFareRevenue['Fare Revenue'][line]+900, visDataTripsPassengersRevenueFareRevenue['Total Revenue'][line]-4000, \
visDataTripsPassengersRevenueFareRevenue['Pickup Date'][line], horizontalalignment='left', size='small', color='black')
formatter = FuncFormatter(currency)
ax.yaxis.set_major_formatter(formatter)
formatter = FuncFormatter(number)
ax.xaxis.set_major_formatter(formatter)
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
plt.show()
def plt_clock(ax, radii, title, color):
N = 24
bottom = 2
# create theta for 24 hours
theta = np.linspace(0.0, 2 * np.pi, N, endpoint=False)
# width of each bin on the plot
width = (2*np.pi) / N
bars = ax.bar(theta, radii, width=width, bottom=bottom, color=color, edgecolor="#999999")
# set the lable go clockwise and start from the top
ax.set_theta_zero_location("N")
# clockwise
ax.set_theta_direction(-1)
# set the label
ax.set_xticks(theta)
ticks = ["{}:00".format(x) for x in range(24)]
ax.set_xticklabels(ticks)
ax.set_title(title)
def tripVis (tripLengthBoundary = 10):
visData = tripDataFare[['pickup_hour', 'dropoff_hour']][tripDataFare['trip_distance'] < tripLengthBoundary]
visData['count'] = 1
df_q3_short = visData.groupby(['pickup_hour', 'dropoff_hour'],as_index=False).count()
df_q3_short.columns = ['PUtime', 'DOtime', 'count']
#df_q3_short.head()
visData = tripDataFare[['pickup_hour', 'dropoff_hour']][tripDataFare['trip_distance'] >= tripLengthBoundary]
visData['count'] = 1
df_q3_long = visData.groupby(['pickup_hour', 'dropoff_hour'],as_index=False).count()
df_q3_long.columns = ['PUtime', 'DOtime', 'count']
#df_q3_long.head()
df_q3 = df_q3_short.merge(df_q3_long, how='left', on=["PUtime", "DOtime"], suffixes=["_short", "_long"]) \
.rename(columns={"count_short":"short trips", "count_long":"long trips", \
"PUtime":"pickup time", "DOtime":"dropoff time"})
#create pickup time dataframe
df_q3_PU = df_q3.groupby(["pickup time"], as_index=False) \
.agg({'short trips': 'sum', 'long trips':'sum'}) \
.sort_values(by="pickup time")
#df_q3_PU.head()
#create dropoff time dataframe
df_q3_DO = df_q3.groupby(["dropoff time"], as_index=False) \
.agg({'short trips': 'sum', 'long trips':'sum'}) \
.sort_values(by="dropoff time")
#df_q3_DO.head()
# diff_short_long_trip_on_time()
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(18,18))
ax = plt.subplot(2,2,1, polar=True)
# make the histogram that bined on 24 hour
radii = np.array(df_q3_PU['short trips'].tolist(), dtype="int64")
title = "Pickup Time for Short Trips"
plt_clock(ax, radii, title, "#dc143c")
ax = plt.subplot(2,2,2, polar=True)
# make the histogram that bined on 24 hour
radii = np.array(df_q3_PU['long trips'].tolist(), dtype="int64")
title = "Pickup Time for Long Trips"
plt_clock(ax, radii, title, "#56B4E9")
ax = plt.subplot(2,2,3, polar=True)
# make the histogram that bined on 24 hour
radii = np.array(df_q3_DO['short trips'].tolist(), dtype="int64")
title = "Dropoff Time for Short Trips"
plt_clock(ax, radii, title, "#dc143c")
ax = plt.subplot(2,2,4, polar=True)
# make the histogram that bined on 24 hour
radii = np.array(df_q3_DO['long trips'].tolist(), dtype="int64")
title = "Dropoff Time for Long Trips"
plt_clock(ax, radii, title, "#56B4E9")
tripVis(5)
tripVis(10)
tripVis(20)
tripVis(30)
tripVis(100)
tripVis(200)
del tripVis
gc.collect()
How are Passengers per trip, payment type, fare and tip amounts distributed?
def retOrderMapping(field):
if field == 'tripLength':
return tripLengthOrderMapping
elif field == 'tripDuration':
return tripDurationMapping
elif field == 'pickup_weekdayWeekend':
return weekdayWeekendMapping
elif field == 'pickup_dayName':
return dayNameMapping
elif field == 'vendor_id':
return vendor_idOrderMapping
elif field == 'payment_type':
return payment_typeOrderMapping
elif field == 'Taxi Use':
return taxiUseOrderMapping
elif field == 'pickup_timeOfDay':
return pickup_timeOfDayOrderMapping
tripLengthOrder = ['up to 1 mile', '1 to 5 miles', '6 to 10 miles', '10 to 20 miles', '20 + miles', 'Error']
# this creates some value key pairs in the correct order
tripLengthOrderMapping = {trip: i for i, trip in enumerate(tripLengthOrder)}
del tripLengthOrder
payment_typeOrder = ['CSH', 'CRD', 'DIS', 'NOC', 'UNK']
# this creates some value key pairs in the correct order
payment_typeOrderMapping = {trip: i for i, trip in enumerate(payment_typeOrder)}
del payment_typeOrder
vendor_idOrder = ['CMT', 'VTS']
# this creates some value key pairs in the correct order
vendor_idOrderMapping = {trip: i for i, trip in enumerate(vendor_idOrder)}
del vendor_idOrder
taxiUseOrder = ['Single Use', 'Low', 'Medium', 'High', 'Extreme', 'Error']
# this creates some value key pairs in the correct order
taxiUseOrderMapping = {trip: i for i, trip in enumerate(taxiUseOrder)}
del taxiUseOrder
pickup_timeOfDayOrder = ['Morning Rush Hour', 'Late morning', 'Early afternoon', 'Evening Rush Hour', 'Evening', 'Night', 'Error' ]
# this creates some value key pairs in the correct order
pickup_timeOfDayOrderMapping = {trip: i for i, trip in enumerate(pickup_timeOfDayOrder)}
del pickup_timeOfDayOrder
weekdayWeekend = ['Weekday', 'Weekend']
# this creates some value key pairs in the correct order
weekdayWeekendMapping = {trip: i for i, trip in enumerate(weekdayWeekend)}
del weekdayWeekend
tripDuration = ['up to 5 minutes', '5 to 10 minutes', '10 to 15 minutes', '15 to 20 minutes', '20 + minutes', 'Error']
# this creates some value key pairs in the correct order
tripDurationMapping = {trip: i for i, trip in enumerate(tripDuration)}
del tripDuration
dayName = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# this creates some value key pairs in the correct order
dayNameMapping = {trip: i for i, trip in enumerate(dayName)}
del dayName
# free memory
gc.collect()
# create a df to use for visualising
visDataQ3 = tripDataFare[['pickup_date','passenger_count', 'trip_distance', 'rate_code',
'store_and_fwd_flag', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount',
'tolls_amount', 'total_amount', 'pickup_dayName', 'pickup_weekdayWeekend', 'tripLength',
'tripDuration']]
visDataQ3.head()
# define which order we are using
orderMapping = tripLengthOrderMapping
byField = 'tripLength'
countField = 'passenger_count'
# Calc the number of journeys per day
visQ3 = visDataQ3[[byField,countField]].groupby(byField, as_index=False)[countField].agg('sum')
visQ3.head()
# create the sort key
key = visQ3[byField].map(orderMapping)
# reorganise the dataframe to the correct sort order
visQ3 = visQ3.iloc[key.argsort()]
visQ3.head()
#plot
fig, ax = plt.subplots(figsize=(15,10))
ax.set_title('Passenger Count by Trip Length for ' + dt, title_font)
visQ3.plot(kind='bar', x=byField, y=countField, ax=ax)
formatter = FuncFormatter(number)
ax.yaxis.set_major_formatter(formatter)
plt.show()
# define which order we are using
orderMapping = tripLengthOrderMapping
byField = 'tripLength'
countField = 'total_amount'
# Calc the number of journeys per day
visQ3 = visDataQ3[[byField,countField]].groupby(byField, as_index=False)[countField].agg('sum')
visQ3.head()
# create the sort key
key = visQ3[byField].map(orderMapping)
# reorganise the dataframe to the correct sort order
visQ3 = visQ3.iloc[key.argsort()]
visQ3.head()
#plot
fig, ax = plt.subplots(figsize=(15,10))
ax.set_title('Total Revenue by Trip Length for ' + dt, title_font)
visQ3.plot(kind='bar', x=byField, y=countField, ax=ax)
formatter = FuncFormatter(number)
ax.yaxis.set_major_formatter(formatter)
plt.show()
# define which order we are using
orderMapping = tripDurationMapping
byField = 'tripDuration'
countField = 'passenger_count'
# Calc the number of journeys per day
visQ3 = visDataQ3[[byField,countField]].groupby(byField, as_index=False)[countField].agg('sum')
visQ3
visQ3['tripDuration']
# create the sort key
key = visQ3['tripDuration'].map(orderMapping)
# reorganise the dataframe to the correct sort order
visQ3 = visQ3.iloc[key.argsort()]
visQ3.head()
#plot
fig, ax = plt.subplots(figsize=(15,10))
ax.set_title('Passenger Count by Trip Length for ' + dt, title_font)
visQ3.plot(kind='bar', x=byField, y=countField, ax=ax)
formatter = FuncFormatter(number)
ax.yaxis.set_major_formatter(formatter)
plt.show()
# define which order we are using
orderMapping = tripDurationMapping
byField = 'tripDuration'
countField = 'total_amount'
# Calc the number of journeys per day
visQ3 = visDataQ3[[byField,countField]].groupby(byField, as_index=False)[countField].agg('sum')
visQ3
# create the sort key
key = visQ3['tripDuration'].map(orderMapping)
# reorganise the dataframe to the correct sort order
visQ3 = visQ3.iloc[key.argsort()]
visQ3.head()
#plot
fig, ax = plt.subplots(figsize=(15,10))
ax.set_title('Revenue by Trip Length for ' + dt, title_font)
visQ3.plot(kind='bar', x=byField, y=countField, ax=ax)
formatter = FuncFormatter(currency)
ax.yaxis.set_major_formatter(formatter)
plt.show()
#clean up
del visDataQ3, visQ3
gc.collect()
def taxiUse(i):
# look at the number of taxis they use and categorise on that
if i <= 0: # catches 0 and neg numbers as errors
result = "Error"
elif i > 0 and i <= 1:
result = "Single Use"
elif i > 1 and i <= 5:
result = "Low"
elif i > 6 and i <= 10:
result = "Medium"
elif i > 10 and i <= 20:
result = "High"
else:
result = "Extreme"
return result
driverVehicle = tripDataFare[['medallion', 'hack_license', 'fare_amount', 'tip_amount', 'total_amount', 'trip_time_in_secs']]
driverVehicle['count'] = 1
#driverVehicle.groupby(['medallion', 'hack_license']).agg('sum')
# filter out some rows for testing code
filter = ['00567B1CBFD51DDFAC73359B09238922', '0002555BBE359440D6CEB34B699D3932', '00B7691D86D96AEBD21DD9E138F90840']
#driverVehicle[driverVehicle['hack_license'].isin(filter)].groupby(['hack_license', 'medallion']).agg('count')
# filter out some rows for testing code
#driverVehicle[['hack_license', 'medallion']][driverVehicle['hack_license'].isin(filter)].groupby(['hack_license'])['medallion'].nunique()
#driverVehicle[driverVehicle['hack_license'].isin(filter)].groupby(['hack_license', 'medallion'], as_index=False).agg('sum')
driverTaxiJourneys = driverVehicle.groupby(['hack_license', 'medallion'], as_index=False).agg('sum')
driverTaxiJourneys.head()
driverTaxiJourneys.columns = ['Driver', 'Taxi', 'Fare Amount', 'Tip Amount', 'Total Amount', 'Trip Time (sec)', 'Trip Count',]
# calculate the number of taxis jounreys used by a driver in the data (in this case a month)
# output of value counts is a series so need to convert to frame and then reset the index to bring the driver into a column
driverInfo = driverTaxiJourneys['Driver'].value_counts().to_frame().reset_index()
driverInfo.columns = ['Driver', '# of Taxis']
driverInfo.head()
# add in the total number of trips the driver has made this month
# this is done inline
driverInfo = pd.merge(driverInfo, \
driverTaxiJourneys.groupby(['Driver'], as_index=False).agg(["sum", "mean"]), \
how='left', \
left_on = ['Driver'], \
right_on = ['Driver'] )
driverInfo.head()
driverInfo.columns = ['Driver', '# of Taxis', 'Fare Amount (sum)', 'Fare Amount (mean)', 'Tip Amount (sum)', 'Tip Amount (mean)', 'Total Amount (sum)', 'Total Amount (mean)', 'Trip Time (sec) (sum)', 'Trip Time (sec) (mean)', 'Trip Count (sum)','Trip Count (mean)']
driverInfo['Trips Per Taxi'] = driverInfo['Trip Count (sum)'] / driverInfo['# of Taxis']
driverInfo['Fare Per Trip'] = driverInfo['Fare Amount (sum)'] / driverInfo['Trip Count (sum)']
driverInfo['Tip Per Trip'] = driverInfo['Tip Amount (sum)'] / driverInfo['Trip Count (sum)']
driverInfo['Amount Per Trip'] = driverInfo['Total Amount (sum)'] / driverInfo['Trip Count (sum)']
driverInfo['Journey Time (sec) Per Trip'] = driverInfo['Trip Time (sec) (sum)'] / driverInfo['Trip Count (sum)']
driverInfo['Journey Time (mins) Per Trip'] = (driverInfo['Trip Time (sec) (sum)']/60) / driverInfo['Trip Count (sum)']
driverInfo['Trip Time (mins) (sum)'] = (driverInfo['Trip Time (sec) (sum)']/60)
driverInfo['Trip Time (hrs) (sum)'] = (driverInfo['Trip Time (sec) (sum)']/3600)
driverInfo['Trip Time (mins) (mean)'] = (driverInfo['Trip Time (sec) (mean)']/60)
driverInfo.head()
Code to view the counts from collections import Counter hist = Counter(driverInfo['# of Taxis'])
# # look at the number of taxis they use and categorise on that
driverInfo['Taxi Use'] = driverInfo["# of Taxis"].map(lambda x: taxiUse(x))
byField = '# of Taxis'
#plot logarithmic
fig, ax = plt.subplots(figsize=(15,10))
ax.set_title('Number of Taxis used by a driver in ' + dt, title_font)
driverInfo[byField].plot(kind='hist', bins=driverInfo['# of Taxis'].max(), ax=ax)
ax.set_yscale('log')
ax.set_ylabel("# of Taxis Used (logarithmic)")
if dt == "Jan 2013":
ax.text(250, ax.get_ylim()[0] + ax.get_ylim()[1] /5000, "1 Driver logged 333 different vehicles")
plt.show()
byField = '# of Taxis'
#plot logarithmic
fig, ax = plt.subplots(figsize=(15,10))
ax.set_title('Number of Taxis used by a driver in ' + dt, title_font)
driverInfo[byField].plot(kind='hist', bins=driverInfo['# of Taxis'].max(), ax=ax)
formatter = FuncFormatter(number)
ax.yaxis.set_major_formatter(formatter)
ax.set_ylabel("# of Taxis Used")
plt.show()
byField = 'Trip Count (sum)'
#plot logarithmic
fig, ax = plt.subplots(figsize=(15,10))
ax.set_title('Total Number of Trips made by a driver in ' + dt, title_font)
driverInfo[byField].plot(kind='hist', bins=driverInfo[byField].max(), ax=ax)
ax.set_xlabel("Total # of Trips")
if dt == "Jan 2013":
ax.text(3, ax.get_ylim()[0] + ax.get_ylim()[1] /1.2, "%.0f Drivers only made one trip, with an Mean of %.1f mins a trip." \
"\n \
(Min:%.0f, Max:%.0f, SD: %.1f)" % (driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].count(), \
driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].mean(), \
driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].min(), \
driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].max(), \
driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].std()))
plt.show()
#Calc for the above chart
print(driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].count())
print(driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].mean())
print(driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].min())
print(driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].mode())
#Counter(driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].astype(int))
print( "%.0f Drivers only made one trip, with an average of %.1f mins a trip." \
"(min:%.1f, Max:%.1f, SD: %.1f)" % (driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].count(), \
driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].mean(), \
driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].min(), \
driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].max(), \
driverInfo['Trip Time (mins) (mean)'][driverInfo[byField]==1].std()))
byField = 'Trips Per Taxi'
#plot logarithmic
fig, ax = plt.subplots(figsize=(15,10))
ax.set_title('Number of Trips Per Taxi made by a driver in ' + dt, title_font)
driverInfo[byField].plot(kind='hist', bins=driverInfo[byField].astype(int).max(), ax=ax)
ax.set_xlabel("# of Trips Per Taxi")
plt.show()
driverInfo.head()
byField = 'Tip Per Trip'
#plot logarithmic
fig, ax = plt.subplots(figsize=(15,10))
ax.set_title('Tip Per Trips in ' + dt, title_font)
driverInfo[byField].plot(kind='hist', bins=driverInfo[byField].max().astype(int), ax=ax)
ax.set_xlabel("# of Trips Per Taxi")
plt.show()
#plot
alpha = 0.3
fig, ax = plt.subplots(figsize=(15,15))
ax.set_title('Scatter of Mean Journey Time vs Mean Tip Per Trip for ' + dt, title_font)
colors = {"Error":'red', "Single Use":'green', "Low":'blue', "Medium":'yellow', "High":'magenta', "Extreme":'black'}
driverInfo.plot(kind='scatter', y='Tip Per Trip', x='Journey Time (mins) Per Trip',\
c=driverInfo['Taxi Use'].apply(lambda x: colors[x]), \
s=20, ax=ax, lw=0, alpha=alpha)
formatter = FuncFormatter(currency)
ax.yaxis.set_major_formatter(formatter)
formatter = FuncFormatter(number)
ax.xaxis.set_major_formatter(formatter)
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
plt.show()
#plot
alpha = 0.3
filter=["Single Use","Low"]
fig, ax = plt.subplots(figsize=(15,15))
ax.set_title('Scatter of Mean Journey Time vs Mean Tip Per Trip for ' + dt, title_font)
colors = {"Error":'red', "Single Use":'green', "Low":'blue', "Medium":'cyan', "High":'magenta', "Extreme":'black'}
driverInfo[driverInfo['Taxi Use'].isin(filter)].plot(kind='scatter', y='Tip Per Trip', x='Journey Time (mins) Per Trip',\
c=driverInfo['Taxi Use'][driverInfo['Taxi Use'].isin(filter)].apply(lambda x: colors[x]), \
s=20, ax=ax, lw=0, alpha=alpha)
formatter = FuncFormatter(currency)
ax.yaxis.set_major_formatter(formatter)
formatter = FuncFormatter(number)
ax.xaxis.set_major_formatter(formatter)
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
plt.show()
#plot
alpha = 0.3
filter=["Single Use","Low"]
fig, ax = plt.subplots(figsize=(15,15))
ax.set_title('Scatter of Mean Journey Time vs Mean Tip Per Trip for ' + dt, title_font)
colors = {"Error":'red', "Single Use":'green', "Low":'blue', "Medium":'cyan', "High":'magenta', "Extreme":'black'}
driverInfo[~driverInfo['Taxi Use'].isin(filter)].plot(kind='scatter', y='Tip Per Trip', x='Journey Time (mins) Per Trip',\
c=driverInfo['Taxi Use'][~driverInfo['Taxi Use'].isin(filter)].apply(lambda x: colors[x]), \
s=20, ax=ax, lw=0, alpha=alpha)
formatter = FuncFormatter(currency)
ax.yaxis.set_major_formatter(formatter)
formatter = FuncFormatter(number)
ax.xaxis.set_major_formatter(formatter)
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
plt.show()
tripDataFare.columns
tripDataFare = pd.merge(tripDataFare, driverInfo[['Driver', 'Taxi Use']], \
how='left', left_on = ['hack_license'], right_on = ['Driver'])
# dont need the extra column as it is the same
tripDataFare.drop('Driver', axis=1)
tripDataFare.head()
del driverInfo, driverTaxiJourneys, driverVehicle
gc.collect()
del visDataFareRevenue, visDataPassenger, visDataRevenue, visDataTrips, visDataTripsPassengers, \
visDataTripsPassengersRevenue, visDataTripsPassengersRevenueFareRevenue
gc.collect()
def comparisonBoxplots(xaxis, yaxis, palette, title="No Title Supplied", showmeans=True, showfliers=True):
fig = plt.figure(1, figsize=(10,5))
ax = sns.boxplot(x=xaxis, y=yaxis, palette=palette, showmeans=showmeans, showfliers=showfliers)
# set the title of the chart
plt.title(title)
# set the labels - x is blank because we don't need it to say "Summary"
ax.set(xlabel='', ylabel='')
# set the labels on the individual x-axis columns
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=25)
plt.show()
print('-' * 100)
return
plotValue = ['tip_amount', 'total_amount']
byValue = ['vendor_id', 'payment_type', 'Taxi Use', 'tripLength', 'tripDuration', 'pickup_dayName', \
'pickup_weekdayWeekend', 'pickup_timeOfDay']
for pv in plotValue:
for bv in byValue:
#reorder the df to print nicely
key = tripDataFare[bv].map(retOrderMapping(bv))
tripDataFare = tripDataFare.iloc[key.argsort()]
#print the first chart without outliers
comparisonBoxplots(
tripDataFare[bv],
tripDataFare[pv],
palette=[my_colors[2], my_colors[1]],
title="Box Plot of " + pv,
showmeans=True,
showfliers=False)
# print the second chart without outliers
comparisonBoxplots(
tripDataFare[bv],
tripDataFare[pv],
palette=[my_colors[2], my_colors[1]],
title="Box Plot of " + pv,
showmeans=True,
showfliers=True)
dir()